{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2a72501d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b265db7",
   "metadata": {},
   "source": [
    "## 抓取页面信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1f5bf2f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "range(0, 250, 25)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Offsets sent as the ?start= query parameter, one per listing page.\n",
    "# The step of 25 presumably matches the number of movies per page (TODO confirm).\n",
    "TOTAL_MOVIES = 250\n",
    "PAGE_SIZE = 25\n",
    "start = range(0, TOTAL_MOVIES, PAGE_SIZE)\n",
    "start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7d357392",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shared buffer for the raw HTML of each listing page; download_html() fills it.\n",
    "html: list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9a219d19",
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_html(timeout=10):\n",
    "    \"\"\"Download every listing page into the module-level ``html`` list.\n",
    "\n",
    "    Sends one GET request per offset in the module-level ``start``. Once all\n",
    "    requests have succeeded, the contents of ``html`` are replaced with the\n",
    "    response bodies in offset order, so calling this again does not append\n",
    "    duplicate pages and a failed run leaves ``html`` untouched.\n",
    "\n",
    "    Args:\n",
    "        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a\n",
    "            stalled connection raises instead of hanging the kernel.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If any page answers with a status code other than 200.\n",
    "    \"\"\"\n",
    "    # Browser-like User-Agent, built once instead of on every iteration.\n",
    "    headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36\"}\n",
    "    pages = []\n",
    "    for si in start:\n",
    "        url = f\"https://movie.douban.com/top250?start={si}\"\n",
    "        res = requests.get(url, headers=headers, timeout=timeout)\n",
    "        if res.status_code != 200:\n",
    "            raise RuntimeError(f\"GET {url} returned HTTP {res.status_code}\")\n",
    "        pages.append(res.text)\n",
    "    # Slice assignment mutates the existing list object rather than rebinding the name.\n",
    "    html[:] = pages"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18b0b1ac",
   "metadata": {},
   "source": [
    "## 解析页面元素"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c8b90749",
   "metadata": {},
   "outputs": [],
   "source": [
    "download_html()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bd0e82eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_html(html):\n",
    "    \"\"\"Extract one record per movie from the downloaded listing pages.\n",
    "\n",
    "    Every ``<div class='info'>`` element found in a page contributes one entry\n",
    "    to each of the returned lists.\n",
    "\n",
    "    Args:\n",
    "        html: Iterable of HTML document strings, one per listing page.\n",
    "\n",
    "    Returns:\n",
    "        A list of eight parallel lists, in this order: href, title, year,\n",
    "        country, style, rating_num, evaluate_num, quote. evaluate_num holds\n",
    "        ints and every other value is a string. quote is '' for an entry\n",
    "        without a ``<span class='inq'>`` element. Director and cast are not\n",
    "        extracted.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If an entry's second description line is missing or does\n",
    "            not split into three fields, or if its vote count has no digits.\n",
    "    \"\"\"\n",
    "    # Three fields separated by a slash that is wrapped in non-breaking spaces.\n",
    "    info_pattern = re.compile(r'(.*)\\xa0/\\xa0(.*)\\xa0/\\xa0(.*)')\n",
    "    href, title, year, country, style = [], [], [], [], []\n",
    "    rating_num, evaluate_num, quote = [], [], []\n",
    "    for hi in html:\n",
    "        bs = BeautifulSoup(hi, \"html.parser\")\n",
    "        for infoi in bs.find_all(\"div\", class_=\"info\"):\n",
    "            link = infoi.a.attrs['href']\n",
    "            content = infoi.p.text.strip().split('\\n')\n",
    "            information = info_pattern.search(content[1].strip()) if len(content) > 1 else None\n",
    "            if information is None:\n",
    "                # Fail with the offending entry named instead of an opaque AttributeError.\n",
    "                raise ValueError(f'unexpected description layout for {link}')\n",
    "            # The last <span> of the star block is the vote count plus a text suffix;\n",
    "            # only the digits are kept.\n",
    "            votes = infoi.find('div', class_='star').find_all('span')[-1].text\n",
    "            # find() returns None when the entry has no quote element.\n",
    "            inq = infoi.find('span', class_='inq')\n",
    "\n",
    "            href.append(link)\n",
    "            title.append(infoi.span.text)\n",
    "            year.append(information.group(1))\n",
    "            country.append(information.group(2))\n",
    "            style.append(information.group(3))\n",
    "            rating_num.append(infoi.find('span', class_='rating_num').text)\n",
    "            evaluate_num.append(int(re.sub(r'\\D', '', votes)))\n",
    "            quote.append(inq.text if inq is not None else '')\n",
    "    return [href, title, year, country, style, rating_num, evaluate_num, quote]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20dfe8c8",
   "metadata": {},
   "source": [
    "## 保存数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "549b9388",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = parse_html(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "44f61faf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_data(data):\n",
    "    \"\"\"Assemble the parsed columns into a pandas DataFrame.\n",
    "\n",
    "    Args:\n",
    "        data: Sequence of exactly eight lists ordered as href, title, year,\n",
    "            country, style, rating_num, evaluate_num, quote.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame whose columns are title, year, country, style, rating_num,\n",
    "        evaluate_num, quote, href (href is placed last). Nothing is written\n",
    "        to disk here.\n",
    "    \"\"\"\n",
    "    href, title, year, country, style, rating_num, evaluate_num, quote = data\n",
    "    # Mapping order is the column order of the resulting frame.\n",
    "    columns = {\n",
    "        'title': title,\n",
    "        'year': year,\n",
    "        'country': country,\n",
    "        'style': style,\n",
    "        'rating_num': rating_num,\n",
    "        'evaluate_num': evaluate_num,\n",
    "        'quote': quote,\n",
    "        'href': href,\n",
    "    }\n",
    "    table = pd.DataFrame()\n",
    "    for name, values in columns.items():\n",
    "        table[name] = values\n",
    "    return table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "251a6161",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = save_data(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "afacfe35",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "      <th>country</th>\n",
       "      <th>style</th>\n",
       "      <th>rating_num</th>\n",
       "      <th>evaluate_num</th>\n",
       "      <th>quote</th>\n",
       "      <th>href</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>肖申克的救赎</td>\n",
       "      <td>1994</td>\n",
       "      <td>美国</td>\n",
       "      <td>犯罪 剧情</td>\n",
       "      <td>9.7</td>\n",
       "      <td>2587809</td>\n",
       "      <td>希望让人自由。</td>\n",
       "      <td>https://movie.douban.com/subject/1292052/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>霸王别姬</td>\n",
       "      <td>1993</td>\n",
       "      <td>中国大陆 中国香港</td>\n",
       "      <td>剧情 爱情 同性</td>\n",
       "      <td>9.6</td>\n",
       "      <td>1921783</td>\n",
       "      <td>风华绝代。</td>\n",
       "      <td>https://movie.douban.com/subject/1291546/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>阿甘正传</td>\n",
       "      <td>1994</td>\n",
       "      <td>美国</td>\n",
       "      <td>剧情 爱情</td>\n",
       "      <td>9.5</td>\n",
       "      <td>1944546</td>\n",
       "      <td>一部美国近现代史。</td>\n",
       "      <td>https://movie.douban.com/subject/1292720/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>泰坦尼克号</td>\n",
       "      <td>1997</td>\n",
       "      <td>美国 墨西哥 澳大利亚 加拿大</td>\n",
       "      <td>剧情 爱情 灾难</td>\n",
       "      <td>9.4</td>\n",
       "      <td>1905861</td>\n",
       "      <td>失去的才是永恒的。</td>\n",
       "      <td>https://movie.douban.com/subject/1292722/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>这个杀手不太冷</td>\n",
       "      <td>1994</td>\n",
       "      <td>法国 美国</td>\n",
       "      <td>剧情 动作 犯罪</td>\n",
       "      <td>9.4</td>\n",
       "      <td>2099488</td>\n",
       "      <td>怪蜀黍和小萝莉不得不说的故事。</td>\n",
       "      <td>https://movie.douban.com/subject/1295644/</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     title  year          country     style rating_num  evaluate_num  \\\n",
       "0   肖申克的救赎  1994               美国     犯罪 剧情        9.7       2587809   \n",
       "1     霸王别姬  1993        中国大陆 中国香港  剧情 爱情 同性        9.6       1921783   \n",
       "2     阿甘正传  1994               美国     剧情 爱情        9.5       1944546   \n",
       "3    泰坦尼克号  1997  美国 墨西哥 澳大利亚 加拿大  剧情 爱情 灾难        9.4       1905861   \n",
       "4  这个杀手不太冷  1994            法国 美国  剧情 动作 犯罪        9.4       2099488   \n",
       "\n",
       "             quote                                       href  \n",
       "0          希望让人自由。  https://movie.douban.com/subject/1292052/  \n",
       "1            风华绝代。  https://movie.douban.com/subject/1291546/  \n",
       "2        一部美国近现代史。  https://movie.douban.com/subject/1292720/  \n",
       "3       失去的才是永恒的。   https://movie.douban.com/subject/1292722/  \n",
       "4  怪蜀黍和小萝莉不得不说的故事。  https://movie.douban.com/subject/1295644/  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "944fd9d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('./top250.csv', encoding='utf_8_sig')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5362ebd0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
